import os
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
# from langchain_community.text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Configure OpenAI credentials and endpoint.
# SECURITY: an API key used to be hard-coded on this line. Secrets must never be
# committed to source control; treat the previously committed key as leaked and
# revoke it. The key is now taken from the OPENAI_API_KEY environment variable.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before running this script."
    )
# Keep the endpoint this script has always used as the default, but allow the
# environment to override it instead of clobbering an existing value.
os.environ.setdefault("OPENAI_API_BASE", "https://ai-yyds.com/v1")

# Load the source text file as LangChain documents.
loader = TextLoader('C:/workspace/gitee/huiyi_pro/ai-interview-backend-demo/data/西游记.txt', encoding="UTF-8")
documents = loader.load()
# Backward-compatible alias: this name used to be rebound to the loaded
# documents, so keep it pointing at them for any code that still reads it.
text_loader = documents

# Split the documents into overlapping chunks (chunk_size=200, chunk_overlap=40).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=40)
splits = text_splitter.split_documents(documents)

# Embedding model used to vectorize each chunk.
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

# Embed the chunks and write them into a Chroma store under ./mychroma_db.
chroma_vector_database = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory='./mychroma_db',
)

